In [10]:
import recordlinkage 
import pandas

import numpy as np
from recordlinkage.preprocessing import clean
from recordlinkage.indexing import SortedNeighbourhoodIndex

In [3]:
d_types = {
'SkylineID': str,
'AxioID': str,
'RcaID': str,
'RentComID': str,
'ApartmentRatingsID': str,
'ZillowID': str,
'ApartmentsID': np.float64,
'Name': str,
'Latitude': np.float64,
'Longitude': np.float64,
'TrueOwner': str,
'ManagementCompany': str,
'State': str,
'City': str,
'Zip': str,
'Street': str,
'Address': str,
'Levels': np.float64,
'Status': str,
'AreaPerUnit': np.float64,
'Units': np.float64,
'YearBuilt': np.float64,
'Website': str,
'Market': str,
'SubMarket': str,
'MarketID': np.float64,
'SubMarketID': np.float64,
}

df_assets = pandas.read_csv('/home/krenous/Development/skyline/AI/propertie-latest',dtype=d_types)

# Preprocessing
### Cleaning and preprocessing data may increase your record linkage accuracy.

In [4]:
df_assets.shape

(249763, 27)

In [5]:
df_assets.dtypes

SkylineID              object
AxioID                 object
RcaID                  object
RentComID              object
ApartmentRatingsID     object
ZillowID               object
ApartmentsID          float64
Name                   object
Latitude              float64
Longitude             float64
TrueOwner              object
ManagementCompany      object
State                  object
City                   object
Zip                    object
Street                 object
Address                object
Levels                float64
Status                 object
AreaPerUnit           float64
Units                 float64
YearBuilt             float64
Website                object
Market                 object
SubMarket              object
MarketID              float64
SubMarketID           float64
dtype: object

In [6]:
df_assets.head()

Unnamed: 0,SkylineID,AxioID,RcaID,RentComID,ApartmentRatingsID,ZillowID,ApartmentsID,Name,Latitude,Longitude,...,Levels,Status,AreaPerUnit,Units,YearBuilt,Website,Market,SubMarket,MarketID,SubMarketID
0,030afa0de8f3541f96eaa22d430aaf04fde84418,,,624361.0,,,,Buena Vida at Town Center- Senior Living,10.465977,-66.765959,...,,,,,,https://www.rent.com/california/las-flores-apa...,,,,
1,d6822389e5b008be99588dcc2bd9c3f9c572ceb8,3483.0,,427024.0,,,,Coachlight Village,42.06147,-72.634206,...,2.0,Stabilized,700.0,88.0,1970.0,http://dawnhomes.com/coachlight-village/,"Springfield, MA",,831.0,
2,69c4a5fe62af11aae68746518fb6516f9b9cfd43,,,57831776.0,,,8379087.0,Sutton Place Apartments,42.09166,-72.637194,...,,,,80.0,,https://www.rent.com/massachusetts/agawam-apar...,"Springfield, MA",,831.0,
3,967650fee78b3f33d3601eca976dc0cfec04c5b9,39445.0,,,4.13253251501002e+16,,,Colonial Village,42.374919,-72.52009,...,2.0,Stabilized,566.1,200.0,1975.0,http://kaminsrealestate.com/our-rentals/rental...,"Springfield, MA",,831.0,
4,0f789a857bb192570e387a68134531da31ad6fb3,39170.0,,,,,,The Boulders,42.350488,-72.527354,...,3.0,Stabilized,843.75,256.0,1974.0,http://www.bouldersapartmenthomes.com/,"Springfield, MA",,831.0,


We should remove all ids that from the different  services

In [7]:
# Drop all uncessary columns
df_min = df_assets.drop(columns=['ManagementCompany','TrueOwner','Website','AxioID', 'RcaID','RentComID','ApartmentRatingsID','ZillowID','ApartmentsID'])

# Clean strings in the Series by removing unwanted tokens, whitespace and brackets.

df_min['Name'] = clean(df_min['Name'])
df_min['Street'] = clean(df_min['Street'])
df_min['Market'] = clean(df_min['Market'])
df_min['SubMarket'] = clean(df_min['SubMarket'])

# Pairing (http://networkslab.org/2017/07/18/2017-07-18-indexing/)

Because of the amount of data full index (index with all possible combinations of record pairs) the record space (A*A) (249763*249763-249763)/2 = 31,190,653,203 pairs 
is not feasible.

We should use generate pairs from a subset of the record space (A*B)

In order to limit the amount of patterns a blocking procedure
    was applied, which selects only record pairs that meet
    specific agreement conditions. The results of the following
    six blocking iterations were merged together:

In [91]:
indexer1 = recordlinkage.SortedNeighbourhoodIndex('Name',window=9,block_on=['State'])
indexer2 = recordlinkage.SortedNeighbourhoodIndex('Address',window=9,block_on=['State'])

indexers= [indexer1,indexer2]

pair1 = indexer1.index(df_min)
pair2 = indexer2.index(df_min)

pairs = pair1.union(pair2)
   
    
    

In [92]:
print(len(pairs))

255817


# Comparing records pairs 

In [162]:
comp = recordlinkage.Compare()

comp.string('Address', 'Address', method='levenshtein', label='Address')
comp.string('Name', 'Name', method='levenshtein', label='Name')
comp.geo('Latitude', 'Longitude','Latitude','Longitude',scale=1, label='Lat Lon')

features = comp.compute(pairs, df_min)

In [163]:
features.head(10)

Unnamed: 0,Unnamed: 1,Address,Name,Lat Lon
1,176011,0.970588,1.0,0.931781
1,176086,0.617647,0.15,0.0
2,53511,0.45,0.521739,0.0
2,176013,0.685714,1.0,0.932104
2,176491,0.52381,0.304348,0.0
3,36,0.6,0.592593,0.0
3,253,0.428571,1.0,0.0
3,53480,0.658537,1.0,0.101563
3,176541,0.414634,1.0,0.0
4,7,0.918367,1.0,1.0


In [164]:
features.describe()

Unnamed: 0,Address,Name,Lat Lon
count,255817.0,255817.0,255817.0
mean,0.54096,0.451203,0.116241
std,0.168263,0.280597,0.292971
min,0.096154,0.0,0.0
25%,0.423077,0.227273,0.0
50%,0.487805,0.428571,0.0
75%,0.608696,0.611111,0.0
max,0.984375,1.0,1.0


In [180]:
# Sum the comparison results.
sums = features.sum(axis=1) /3 
sums.sort_index(ascending=False).head(25)

249691  249720    0.484848
249690  249696    0.346230
249655  249733    0.504065
249621  249625    0.692650
249615  249725    0.349174
249584  249617    0.338730
249579  249584    0.935113
249576  249589    0.573669
249551  249555    0.796360
249550  249593    0.712901
249549  249550    0.774171
249548  249555    0.808329
        249551    0.778704
249545  249549    0.378069
249538  249615    0.402098
249536  249555    0.832464
        249551    0.801988
        249548    0.936410
249522  249621    0.270613
249514  249597    0.524823
249503  249641    0.207692
249488  249654    0.391667
249481  249482    0.960317
249471  249688    0.475000
249457  249720    0.500000
dtype: float64

In [184]:
#The last step is to decide which records is duplicate. In this example, we keep only when the close estimation is  70%:
matches = features[sums > 0.7 ]
print(len(matches))
matches.head(30)

22840


Unnamed: 0,Unnamed: 1,Address,Name,Lat Lon
1,176011,0.970588,1.0,0.931781
2,176013,0.685714,1.0,0.932104
4,7,0.918367,1.0,1.0
12,53481,0.973684,0.619048,0.953587
13,53471,0.902439,1.0,0.900627
14,176024,0.952381,0.56,0.966272
17,176031,0.945946,1.0,0.964187
19,176034,0.955556,1.0,0.958169
31,176079,0.953488,0.521739,0.918144
32,176078,0.87234,1.0,1.0


In [84]:
    pandas.DataFrame([df_min.iloc[32],df_min.iloc[176078]])

Unnamed: 0,SkylineID,Name,Latitude,Longitude,State,City,Zip,Street,Address,Levels,Status,AreaPerUnit,Units,YearBuilt,Market,SubMarket,MarketID,SubMarketID
32,6982c0c5eb0f827b9a0287ab8e14e43121cbe5d2,hathaway farms,42.334055,-72.640441,MA,Northampton,1060,73 barrett st ste 2000,"73 Barrett St #2000, Northampton, MA 01060, USA",2.0,Stabilized,975.2657,207.0,1979.0,springfield ma,,831.0,
176078,2a4a96f6074ee6704ab04c9dd7354349309cfe22,hathaway farms,42.334055,-72.640441,MA,Northampton,1060,73 barrett st,"73 Barrett St, Northampton, MA 01060, USA",2.0,,,207.0,1972.0,springfield ma,,831.0,


# test Unsupervised learning

## K-means clustering

In [120]:
compK = recordlinkage.Compare()

compK.string('Address', 'Address', method='levenshtein', label='Address')
compK.string('Name', 'Name', method='levenshtein', label='Name')
compK.exact('State', 'State', label='State')
compK.exact('City', 'City', label='City')
compK.exact('Zip', 'Zip', label='Zip')
compK.exact('MarketID', 'MarketID', label='MarketID')
compK.exact('SubMarket', 'SubMarket', label='SubMarket')
compK.geo('Latitude', 'Longitude','Latitude','Longitude',method='step', label='Lat Lon')

featuresK = comp.compute(pairs, df_min)

In [121]:
kmeans = recordlinkage.KMeansClassifier()
result_kmeans = kmeans.learn(featuresK)


91608

In [131]:
predict = kmeans.predict(featuresK,return_type='series')

In [144]:
matchesK = predict[predict ==1 ]
print( len(matchesK))
matchesK.head()

91608


1  176011    1
2  176013    1
3  36        1
   253       1
   53480     1
Name: classification, dtype: int32

We can see that the number of positive predictions 

In [150]:
df_min.iloc[3]

SkylineID      967650fee78b3f33d3601eca976dc0cfec04c5b9
Name                                   colonial village
Latitude                                        42.3749
Longitude                                      -72.5201
State                                                MA
City                                            Amherst
Zip                                               01002
Street                                 55 s pleasant st
Address        55 S Pleasant St, Amherst, MA 01002, USA
Levels                                                2
Status                                       Stabilized
AreaPerUnit                                       566.1
Units                                               200
YearBuilt                                          1975
Market                                   springfield ma
SubMarket                                           NaN
MarketID                                            831
SubMarketID                                     

In [152]:
df_min.iloc[53480]

SkylineID       5338b69227593bc19de45697add783f2024c0828
Name                                    colonial village
Latitude                                         42.3699
Longitude                                       -72.4993
State                                                 MA
City                                             Amherst
Zip                                                01002
Street                                               NaN
Address        81 Belchertown Rd, Amherst, MA 01002, USA
Levels                                               NaN
Status                                               NaN
AreaPerUnit                                          NaN
Units                                                200
YearBuilt                                           1915
Market                                    springfield ma
SubMarket                                            NaN
MarketID                                             831
SubMarketID                    

## ECMClassifier

In [160]:

# Train the classifier
ecm = recordlinkage.ECMClassifier()
result_ecm = ecm.learn((featuresK > 0.8).astype(int), return_type='series')



In [161]:
matchesEcm = result_ecm[result_ecm ==1 ]
print( len(matchesEcm))
matchesEcm.head()

29725


1   176011    1
4   7         1
12  53481     1
13  53471     1
14  176024    1
Name: classification, dtype: int64

# conclusion

 **Total Dataset : 249763**

Duplicate matches per predictions 

+ Comparaison: 22840 (9%)
+ K-means: 91608 (36%)
+ ECMClassifier: 29725 (12%)
    
By the number of matches we can easily see that The K-means with a basic configuration is not enough accurate.
ECM Classifier did well with matches that seems pretty close to the reality, also for the comparaison who the 
   