In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import re
import os
#import nltk
from nltk.corpus import stopwords
import time

In [2]:
stopWords = stopwords.words('english')
months = ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']
convertAlphaMonthsToNumeric = {months[i-1]:i for i in np.arange(1,13)}

In [3]:
directory = 'C:/Users/Krist/University College London/Spatio-Temporal Data Analysis and Data Mining/Data-Nav'
i = 0

for filename in os.listdir(directory):
    
    if i == 0:
        combinedData = gpd.read_file(directory+'/'+filename)
        date = re.split('[_.]',filename)[1]
        date = [date[i:i+2] for i in np.arange(0,len(date),2)]
        year = ''.join(['20'+date[0]])
        date = pd.DataFrame({
            'Year': np.repeat(year,combinedData.shape[0]),
            'Month': np.repeat(date[1],combinedData.shape[0]),
            'Day': np.repeat(date[2],combinedData.shape[0])})
        
        date = pd.DataFrame({'BroadcastingDate' : pd.to_datetime(date)})
        combinedData = pd.concat([combinedData,date],axis=1)
    else:
        data1 = gpd.read_file(directory+'/'+filename)
        date = re.split('[_.]',filename)[1]
        date = [date[i:i+2] for i in np.arange(0,len(date),2)]
        year = ''.join(['20'+date[0]])
        date = pd.DataFrame({
            'Year': np.repeat(year,data1.shape[0]),
            'Month': np.repeat(date[1],data1.shape[0]),
            'Day': np.repeat(date[2],data1.shape[0])})
        
        date = pd.DataFrame({'BroadcastingDate' : pd.to_datetime(date)})
        data1 = pd.concat([data1,date],axis=1)
        combinedData = pd.concat([combinedData,data1])
    i += 1

combinedData = combinedData.reset_index(drop=True)

In [4]:
start_1 = time.time()

indices = combinedData.index
# Defining the frames that we need to buiild
descriptionFrame = pd.DataFrame(index=indices, columns=['Area','Chart', 'Message','Authority','Date','StartDate',
                                                        'EndDate'])
coordinates = pd.DataFrame(index = indices, columns = ['Coordinates'],dtype = 'object')

# Setting the existing data up
broadcastingDate = pd.DataFrame(combinedData['BroadcastingDate'],index=indices,columns=['BroadcastingDate'])
geometryType = pd.DataFrame(combinedData['geometry'].geom_type,index=indices,columns=['GeometryType'])
geometry = pd.DataFrame(combinedData['geometry'],index = indices, columns = ['geometry'])


In [5]:
# This function needs a little bit of explanation. The outline of the function is based on the nature of the messages, and the
# the idea is simply to look for words/phrases that are associated with a start/end dates. There are two ways to detect the end
# date and one way to detect the start date, besides just using the broadcasting date as both start-/end-date.
# The word/phrase associated with the start-/end-dates are 'thru' and 'cancel this msg', and 'thru' can indicate both a start
# date and a end date, where 'cancel this msg' indicates an end date.
# The general structure of the messages is as follows: if 'thru' is present, the typical message goes something like 'day month
# thru day month', which allows us to estract both the start and end date. Howver, the messages vary greatly in structure
# and some list specific days using the word 'thru', an example being 'Wednesday thru Fridays, except holidays for the period
# day month thru day month', while other messages exclude the first month, an example being 'day thru day month'. There are 
# as such a lot of cases to consider, and the function belows assumes that the structure of the message is either 'day month
# thru day month' or 'day thru day month', and if that is not the case, use the broadcasting day as perhaps both start-/end-date.
# The phrase 'cancel this msg' can also reveal the end date, and the general structure surrounding this idea is 'cancel this
# msg day month year'.
# It can seem obvious to suggest an extension that wouldcompare the end dates found using 'thru' and 'cancel this msg', but
# it is redundant as the end-date found by 'cancel this message' always will dominate the end-date found by 'thru'. The impli-
# cation being that if 'cancel this msg' appears in the message, that is the default way of locating the end-date.
# The only check that should be performed, is that no start date is greater than the end date.

def ExtractDates(message,bcDate,monthdict):
    count = 0
    count_1 = 0
    
    if 'cancel this msg' in message.lower() or 'thru' in message.lower():
        count += 1
        
        if 'cancel this msg' in message.lower():
            # The statement 'cancel this msg' is used to locate the end-date by default.
            count_1 += 1
            sm = message.lower().split()
            try:
                msg = [i for i,e in enumerate(sm) if e == 'msg'][-1]
                subsm = sm[msg+1:]
                
                if subsm[0][0:2].isdigit():
                    end_day = subsm[0][0:2]
                else:
                    raise ValueError

                potMonth = subsm[1][0:3]
                if potMonth in monthdict.keys():
                    end_month = monthdict[potMonth]
                else:
                    raise ValueError
                try:
                    end_year = ''.join(('20',re.sub('[./]','',subsm[2])))
                except:
                    end_year = bcDate.year
            
                eDate = pd.DataFrame({'Year': end_year, 'Month': end_month, 'Day': end_day},index = [0])
                eDate = pd.DataFrame({'endDate' : pd.to_datetime(eDate)})
            
            except:
                eDate = pd.DataFrame({'endDate':bcDate},index=[0])
            
            # if no 'thru' is present, just use the broadcastingDate as startDate
            if 'thru' not in message.lower():
                sDate = pd.DataFrame({'startDate':bcDate},index=[0])
            else:
                thru = [i for i,e in enumerate(sm) if e == 'thru'][0]
                subThruSM = sm[thru-2:thru+3]
                # Potential start date
                potDay = subThruSM[0][0:2]
                # Potential months
                potMonth = subThruSM[1][0:3]
                potMonthAlt = subThruSM[4][0:3]
                
                # Trying to locate the start date through the word 'thru'
                if potDay.isdigit() and potMonth in monthdict.keys():
                    # Getting the month in numeric format
                    potMonth = monthdict[potMonth]
                    sDate = pd.DataFrame({'Year': bcDate.year, 'Month': potMonth, 'Day': potDay},index = [0])
                    sDate = pd.DataFrame({'startDate' : pd.to_datetime(sDate)})
                elif potDay.isdigit() and potMonthAlt in monthdict.keys():
                    # Getting the month in numeric format
                    potMonthAlt = monthdict[potMonthAlt]
                    sDate = pd.DataFrame({'Year': bcDate.year, 'Month': potMonthAlt, 'Day': potDay},index = [0])
                    sDate = pd.DataFrame({'startDate' : pd.to_datetime(sDate)})
                else:
                    sDate = pd.DataFrame({'startDate':bcDate},index=[0])
        else:
            # Trying to find start and end date from the 'thru' word
            sm = message.lower().split()
            thru = [i for i,e in enumerate(sm) if e == 'thru'][0]
            subThruSM = sm[thru-2:thru+3]
            
            # Potential start-date
            potDay = subThruSM[0][0:2]
            # Potential end-date
            potDayEnd = subThruSM[3][0:2]
            # Potential start and end months
            potMonth = subThruSM[1][0:3]
            potMonthAlt = subThruSM[4][0:3]
            
            # Locating the start date
            if potDay.isdigit() and potMonth in monthdict.keys():
                # Getting the month in numeric format
                potMonth = monthdict[potMonth]
                sDate = pd.DataFrame({'Year': bcDate.year, 'Month': potMonth, 'Day': potDay},index = [0])
                sDate = pd.DataFrame({'startDate' : pd.to_datetime(sDate)})
            elif potDay.isdigit() and potMonthAlt in monthdict.keys():
                # Getting the month in numeric format
                potMonthAlt = monthdict[potMonthAlt]
                sDate = pd.DataFrame({'Year': bcDate.year, 'Month': potMonthAlt, 'Day': potDay},index = [0])
                sDate = pd.DataFrame({'startDate' : pd.to_datetime(sDate)})
            else:
                sDate = pd.DataFrame({'startDate':bcDate},index=[0])
            # Finding the ending-date  
            if potDayEnd.isdigit() and potMonth in monthdict.keys():
                # Getting the month in numeric format
                potMonth = monthdict[potMonth]
                eDate = pd.DataFrame({'Year': bcDate.year, 'Month': potMonth, 'Day': potDayEnd},index = [0])
                eDate = pd.DataFrame({'endDate' : pd.to_datetime(eDate)})
            elif potDayEnd.isdigit() and potMonthAlt in monthdict.keys():
                # Getting the month in numeric format
                potMonthAlt = monthdict[potMonthAlt]
                eDate = pd.DataFrame({'Year': bcDate.year, 'Month': potMonthAlt, 'Day': potDayEnd},index = [0])
                eDate = pd.DataFrame({'endDate' : pd.to_datetime(eDate)})
            elif sDate['startDate'].loc[0]==bcDate:
                eDate = pd.DataFrame({'endDate' : bcDate},index=[0])
            else:
                eDate = pd.DataFrame({'endDate' : bcDate},index=[0])
    else:
        sDate = pd.DataFrame({'startDate':bcDate},index=[0])
        eDate = pd.DataFrame({'endDate':bcDate},index=[0])
    
    return sDate['startDate'].loc[0], eDate['endDate'].loc[0]


In [6]:
def handleNoChartObs_1(noChartsObsList,noChartDataFrame,finalDataFrame,messageWords,areaWords,succesfulObs):
    
    additionsWords = []
    additionsAreas = []

    minTempEle = 1000
    maxTempEle = 0
    
    numberWithoutArea = 0
    numberWithoutMessage = 0
    
    for ele in noChartDataFrame.index:

        tempobs = noChartsObsList[ele]

        # Setting up our comparisonFrame
        compare = pd.DataFrame(index=np.arange(len(tempobs)),columns=['message','area'])
        replaceindex = noChartDataFrame['index'].loc[ele]
        p = 0

        for subele in tempobs:
            # Checking if it is time to store the 'Authority' index
            if 'Aut' in subele:
                aut = p
            # Checking if it is time to store the 'Date' index
            if 'Date' in subele:
                dat = p

            messageprob = 0
            areaprob = 0

            # Cleaning up individual words
            tempele = re.split('[ ]',subele)
            tempele = [re.sub('[ :,.0-9-/]','',s) for s in tempele]
            tempele = [re.sub('\t','',s) for s in tempele]
            tempele = [re.sub('.//','',s) for s in tempele]
            tempele = [i for i in tempele if i != '']
            
            k = 0
            l = 0
            # Extracting probabilities that a element in 'tempobs' is either an area element or a message element.
            for subsubele in tempele:
                # Standard Score metric
                try:
                    messageprob += messageWords[subsubele]/succesfulObs
                    #k += 1
                except:
                    messageprob = 0

                try:
                    areaprob += areaWords[subsubele]/succesfulObs
                    #l += 1
                except:
                    areaprob = 0
            # Storing in of comparisonFrame
            
            # Probabilitiy metric
#             if k > 0:
#                 compare['message'].loc[p] = messageprob/k
#             else:
#                 compare['message'].loc[p] = 0
            
#             if l > 0:
#                 compare['area'].loc[p] = areaprob/l
#             else:
#                 compare['area'].loc[p] = 0
            # Score Metric
            compare['message'].loc[p] = messageprob
            compare['area'].loc[p] = areaprob
            p += 1
            
        # Extracting the needed indices
        areaindex = list(compare.index[compare['area']>compare['message']])
        messageindex = list(compare.index[compare['area']<=compare['message']])

        if areaindex != [] and messageindex != []:
         
            while(min(messageindex) < min(areaindex)):
                wrongindex = min(messageindex)
                messageindex.remove(wrongindex)
                areaindex.append(wrongindex)
            while(max(areaindex) in np.arange(min(messageindex),max(messageindex))):
                wrongIndex = max(areaindex)
                areaindex.remove(wrongIndex)
                messageindex.append(wrongIndex)
    
            # Removing the indices for 'Authority' and ' Date'
            if aut in areaindex:
                areaindex.remove(aut)
            elif aut in messageindex:
                messageindex.remove(aut)

            if dat in areaindex:
                areaindex.remove(dat)
            elif dat in messageindex:
                messageindex.remove(dat)
            # Correcting, if a 'Final-Cancel' is included in the message
            maxMessageIndex = max(messageindex)
            if maxMessageIndex> aut and maxMessageIndex> dat:
                messageindex.remove(maxMessageIndex)
            
            # Joining together the new area and message variable
            # Adding the individual words for new areas/messages to the frequency count
              
            area = '/'.join(tempobs[min(areaindex):max(areaindex)+1])
            tempaddAreas = re.split('[ ,./:]',area)
            tempaddAreas = [i for i in tempaddAreas if i != '' and len(re.findall('[0-9]',i)) == 0]
            tempaddAreas = list(set(tempaddAreas))

            # Counting the frequency of each word in the areas
            for a in tempaddAreas:

                if a not in areaWords.keys():
                    areaWords[a] = 1
                else:

                    areaWords[a] += 1
        
            message = ' '.join(tempobs[min(messageindex):max(messageindex)+1])
            tempaddWords = re.split('[ ,./:]',message)
            tempaddWords = [i for i in tempaddWords if i != '' and len(re.findall('[0-9]',i)) == 0]
            tempaddWords = list(set(tempaddWords))

            # Counting the frequency of each word in the messages
            for w in tempaddWords:

                if w not in messageWords.keys():
                    messageWords[w] = 1
                else:
                    messageWords[w] += 1
                    
            succesfulObs += 1
        
        elif areaindex == [] and messageindex != []:
            
            # Removing the indices for 'Authority' and ' Date'
            if aut in messageindex:
                messageindex.remove(aut)

            if dat in messageindex:
                messageindex.remove(dat)
                
            # Correcting, if a 'Final-Cancel' is included in the message
            maxMessageIndex = max(messageindex)
            if maxMessageIndex> aut and maxMessageIndex> dat:
                messageindex.remove(maxMessageIndex)
                
            message = ' '.join(tempobs[min(messageindex):max(messageindex)+1])
            tempaddWords = re.split('[ ,./:]',message)
            tempaddWords = [i for i in tempaddWords if i != '' and len(re.findall('[0-9]',i)) == 0]
            tempaddWords = list(set(tempaddWords))

            # Counting the frequency of each word in the messages
            for w in tempaddWords:

                if w not in messageWords.keys():
                    messageWords[w] = 1
                else:
                    messageWords[w] += 1
            
            numberWithoutArea += 1
        
        elif areaindex != [] and messageindex == []:
            # Removing the indices for 'Authority' and ' Date'
            if aut in areaindex:
                areaindex.remove(aut)

            if dat in areaindex:
                areaindex.remove(dat)
            
            area = '/'.join(tempobs[min(areaindex):max(areaindex)+1])
            tempaddAreas = re.split('[ ,./:]',area)
            tempaddAreas = [i for i in tempaddAreas if i != '' and len(re.findall('[0-9]',i)) == 0]
            tempaddAreas = list(set(tempaddAreas))

            # Counting the frequency of each word in the areas
            for a in tempaddAreas:

                if a not in areaWords.keys():
                    areaWords[a] = 1
                else:

                    areaWords[a] += 1
            
            numberWithoutMessage += 1
                    
        else:
            print('Error in ',replaceindex)
        
        # Throwing the newly created variables into the 'descriptionFrame' at the correct locations
        broadcastDate = broadcastingDate['BroadcastingDate'].loc[replaceindex]
        SDate, EDate = ExtractDates(message,broadcastDate,convertAlphaMonthsToNumeric)
        
        finalDataFrame['Area'].loc[replaceindex] = area
        finalDataFrame['Message'].loc[replaceindex] = message
        finalDataFrame['StartDate'].loc[replaceindex] = SDate
        finalDataFrame['EndDate'].loc[replaceindex] = EDate
        
    print('Number of observations without "Area" is: ' , numberWithoutArea)
    print('Number of observations without "Message" is: ' , numberWithoutMessage)
    print('Number of observations succesful is: \n' , succesfulObs)
    return finalDataFrame,succesfulObs,messageWords,areaWords

In [7]:
messagewords = {}
areawords = {}
nocharttemp = []
chartwords = []

# maxLenW = 0
# minLenW = 1000
maxLenA = 0
minLenA = 1000
areaLength = []
# let's make that nice-ass-looking dataframe with all our observations

for obs in indices:
    
    chartindex = 0
    
    coordinate = []

    if geometryType['GeometryType'].loc[obs] == 'GeometryCollection':
        iteration = 0
        for element in list(geometry['geometry'].loc[obs].geoms):
            if iteration == 0:
                coordinate = list(element.coords)
            else:
                coordinate.append(list(element.coords)[0])
            iteration +=1
    
        #descrip['Coordinates'].loc[elements] = coordinate
    else:
        coordinate = list(geometry['geometry'].loc[obs].coords)
        #coordinates['Coordinates'].loc[elements] = coordinate
    
    ############################################## Extracting unknown input ################################################
    
    temp = re.split('[<>]',combinedData['description'].loc[obs])
    temp = [ele for ele in temp if ele not in ['B','','br','/B',' ']]
    temp = [re.sub('[\n\t]','',ele) for ele in temp]

    ############################################## Finding the authority ################################################
    
    authority = [ele for ele in temp if 'Authority' in ele][0].split()
    authority = [i for i in authority if (re.sub('[0-9/.,Zz]','',i) != '' and i.lower() not in months and 'authority' not in i.lower())]
    authority = ' '.join(authority)
    if authority == '':
        authority = 'No Authority Listed'
        
    ############################################## Finding the date #####################################################
    
    date = [ele for ele in temp if 'Date' in ele][0].split()   
    date = [i for i in date if 'date' not in i.lower()]
        
    if date != []:
        try:
            day = date[0][0:2]
            month = date[1]
            year = date[2]
            year = ''.join('20'+year)
        except:
            date = 'Wrong Format'

        if day.isdigit() and month.lower() in months and year.isdigit():
            month = convertAlphaMonthsToNumeric[month.lower()]
            date = pd.DataFrame({'Year': year, 'Month': month, 'Day': day},index = [0])
            date = pd.DataFrame({'Date' : pd.to_datetime(date)})
            date = date['Date'].loc[0]
            #print(Date)
    else:
        date = 'No date listed'

    ############################################## Finding the chart index ################################################
       
    chartindex = [i for i in np.arange(len(temp)) if 'CHART' in temp[i]]
    
    if chartindex == []:
        chart = 'NO CHART'
        chartindex = ''
        nocharttemp.append(temp)
    else:
        
        chartindex = chartindex[0]
        chart = temp[chartindex].split()
        chart = [i for i in chart if 'chart' not in i.lower()][0]
        if obs == 0:
            print(chart)
    try:
        area = '/'.join(temp[:chartindex])
    except:
        area = ''
    
    # Extracting and combing the message
        
    message = []
    
    if chartindex != '':
        
        for i in np.arange(chartindex + 1,len(temp)):

            if '//' in temp[i]:
                message.append(temp[i])
                break
            else:
                message.append(temp[i])
                
    # Joining the message together
    message = ' '.join(message)
    
    # Getting each word in a list
    words = re.split('[ ]', message)
    areas = re.split('[ /]', area)
    charts = re.split(' ', chart)

    # Cleaning
    words = [i for i in words if i not in ["", " "] and len(re.findall('[0-9]',i)) == 0]
    words = [re.sub('[.:, \t]','',i) for i in words]
    words = [re.sub('.//','',i) for i in words]
    words = [i for i in words if i != '']
    # Getting unique values
    words = list(set(words))
    
    areas = [i for i in areas if i not in ["", " "] and len(re.findall('[0-9]',i)) == 0]
    areas = [re.sub('[.:, \t]','',i) for i in areas]
    areas = [re.sub('.//','',i) for i in areas]
    areas = [i for i in areas if i != '']   
    # Getting unique values
    areas = list(set(areas))

    # Counting the frequency of each word in the messages
    for w in words:
                    
        if w not in messagewords.keys():
            messagewords[w] = 1
        else:
            messagewords[w] += 1

    # Counting the frequency of each word in the areas
    for a in areas:
             
        if a not in areawords.keys():
            areawords[a] = 1
        else:

            areawords[a] += 1
    
    if chart != 'NO CHART':
        broadcastDate = broadcastingDate['BroadcastingDate'].loc[obs]
        startDate, endDate = ExtractDates(message,broadcastDate,convertAlphaMonthsToNumeric)
    else:
        startDate = ''
        endDate = ''

    # Adding this particular observation to the dataframe.
    descriptionFrame.loc[obs] = [area,chart,message,authority,date,startDate,endDate]#,broadcastingDate,
                                 #geometryType,geometry,coordinates]#,startDate,endDate,
    coordinates['Coordinates'].loc[obs] = coordinate
# Time to go over those observations which didn't had a 'chart' variable

nochart = descriptionFrame[descriptionFrame['Chart']=='NO CHART']
nochart = nochart.reset_index()
succesful = (descriptionFrame.shape[0]-nochart.shape[0])
print('All observations has been processed the first time, and %i observations did not have a "chart" variable to ensure a correct separation of the message.' % nochart.shape[0])
print('Those %i observations will now be processed again, using knowledge from the remaining %i observations.\n' % (nochart.shape[0],succesful))   

dataFrame, succesObs, messageWordCount,areaWordsCount = handleNoChartObs_1(nocharttemp,nochart,descriptionFrame,messagewords,areawords,succesful)
dataFrame_1 = pd.concat([dataFrame,broadcastingDate,geometryType,geometry,coordinates],axis = 1)
end_1 = time.time()
print('It took %f to process the data, from raw to cleaned' % (end_1 - start_1))

15940
All observations has been processed the first time, and 22265 observations did not have a "chart" variable to ensure a correct separation of the message.
Those 22265 observations will now be processed again, using knowledge from the remaining 31691 observations.

Number of observations without "Area" is:  98
Number of observations without "Message" is:  0
Number of observations succesful is: 
 53858
It took 1679.460135 to process the data, from raw to cleaned


In [8]:
dataFrame_1

Unnamed: 0,Area,Chart,Message,Authority,Date,StartDate,EndDate,BroadcastingDate,GeometryType,geometry,Coordinates
0,CANADA-NORTH COAST. /BEAUFORT SEA.,15940,BOTTOM SCIENTIFIC MOORING EXTENDING 3.5 METERS...,NAVAREA XVII,No date listed,2018-06-01 00:00:00,2018-06-01 00:00:00,2018-06-01,Point,POINT Z (-127.688834 70.55800000000001 0),"[(-127.688834, 70.558, 0.0)]"
1,ARCTIC. /BEAUFORT SEA.,15945,"SCIENTIFIC MOORINGS, TOP FLOAT 29 METERS, ESTA...",NAVAREA XVII,No date listed,2018-06-01 00:00:00,2018-06-01 00:00:00,2018-06-01,Point,POINT Z (-133.714 70.0585 0),"[(-133.714, 70.0585, 0.0)]"
2,ARCTIC. /BEAUFORT SEA.,15945,"SCIENTIFIC MOORINGS, TOP FLOAT 29 METERS, ESTA...",NAVAREA XVII,No date listed,2018-06-01 00:00:00,2018-06-01 00:00:00,2018-06-01,Point,POINT Z (-133.717167 70.0585 0),"[(-133.717167, 70.0585, 0.0)]"
3,ARCTIC./CHUCKCHI SEA.,16003,"SCIENTIFIC MOORING, TOP FLOAT 971 FEET, ESTABL...",CGC HEALY,No date listed,2018-06-01 00:00:00,2018-06-01 00:00:00,2018-06-01,Point,POINT Z (-158.410434 72.7996 0),"[(-158.410434, 72.7996, 0.0)]"
4,ARCTIC./CHUKCHI SEA.,16003.,DNC 27. SCIENTIFIC MOORING ESTABLISHED VICINIT...,NAVAREA XVII,No date listed,2018-06-01 00:00:00,2018-06-01 00:00:00,2018-06-01,Point,POINT Z (-158.702167 72.615167 0),"[(-158.702167, 72.615167, 0.0)]"
5,ARCTIC./CHUKCHI SEA.,16003.,DNC 27. SCIENTIFIC MOORING ESTABLISHED VICINIT...,NAVAREA XVII,No date listed,2018-06-01 00:00:00,2018-06-01 00:00:00,2018-06-01,Point,POINT Z (-158.412667 72.800167 0),"[(-158.412667, 72.800167, 0.0)]"
6,ARCTIC./BEAUFORT SEA./DNC 27.,15926.,"SCIENTIFIC MOORING, AT DEPTH 61 METERS, ESTABL...",NAVAREA XVII,2015-10-01 00:00:00,2018-06-01 00:00:00,2018-06-01 00:00:00,2018-06-01,Point,POINT Z (-139.020667 70.432334 0),"[(-139.020667, 70.432334, 0.0)]"
7,ARCTIC./BEAUFORT SEA./DNC 27.,15926.,"SCIENTIFIC MOORINGS, AT DEPTH 183 METERS AND A...",NAVAREA XVII,2015-10-02 00:00:00,2018-06-01 00:00:00,2018-06-01 00:00:00,2018-06-01,Point,POINT Z (-135.011 70.86966700000001 0),"[(-135.011, 70.869667, 0.0)]"
8,ARCTIC./BEAUFORT SEA./DNC 27.,15926.,"SCIENTIFIC MOORINGS, AT DEPTH 183 METERS AND A...",NAVAREA XVII,2015-10-02 00:00:00,2018-06-01 00:00:00,2018-06-01 00:00:00,2018-06-01,Point,POINT Z (-135.019167 70.869 0),"[(-135.019167, 70.869, 0.0)]"
9,ARCTIC./CHUKCHI SEA./DNC 27.,16005.,SUB-SURFACE SCIENTIFIC MOORINGS ESTABLISHED IN...,USCGC HEALY,2016-08-04 00:00:00,2018-06-01 00:00:00,2018-06-01 00:00:00,2018-06-01,Point,POINT Z (-161.500184 71.600117 0),"[(-161.500184, 71.600117, 0.0)]"


In [48]:
dataFrame_2 = dataFrame_1.iloc[dataFrame_1.astype(str).drop_duplicates().index]
print('There are %i observations which appears to be duplicates, which is %f percent of the total observations' % (dataFrame_1.shape[0]-dataFrame_2.shape[0],
                                                                                                                    ((dataFrame_1.shape[0]-dataFrame_2.shape[0])/dataFrame_1.shape[0])*100))

There are 148 observations which appears as duplicates, which is 0.274298 percentage of the total observations


In [9]:
dataFrame_1.to_csv('CleanedData.csv')

In [None]:
with open('WordCount.txt','w') as data:
    for k,v in messageWordCount.items():
        data.write(str(k)+'\t'+str(v)+'\n')

In [None]:
with open('AreaCount.txt','w') as data:
    for k,v in areaWordsCount.items():
        data.write(str(k)+'\t'+str(v)+'\n')

In [10]:
areaWordsSubset = {}
areaWordsInMessage = 0
for key in areaWordsCount.keys():
    if key in messageWordCount.keys():
        areaWordsInMessage += 1
        areaWordsSubset[key] = areaWordsCount[key]
#areaWordsSubset
areaWordsInMessage/len(messageWordCount.keys())

0.04354066985645933

In [None]:
testset = np.random.randint(0,combinedData.shape[0],5000)
testDataFrame = combinedData.loc[testset]
testDataFrame = testDataFrame.reset_index()#drop=True

In [None]:
testDataFrame